Code
# Load packages once. The rendered chunk echoed this library() block twice and
# fused two calls onto single lines (a copy/paste rendering artifact) — the
# deduplicated list below is the full dependency set used in this document.
library(tidyverse)   # readr::read_csv, dplyr verbs, pipes
library(ggplot2)
library(lavaan)
library(car)
library(caret)       # createDataPartition(), trainControl(), train()
library(ranger)
library(doParallel)  # registerDoParallel() for parallel cross-validation

This data set is from the 2015 Asian American Quality of Life survey. Participants are from Austin, Texas.
# Read the AAQoL survey and prepare categorical predictors for modeling:
# character columns become factors, baseline reference levels are set, and
# Likert-style items get an explicit ordered level sequence.
qol <- read_csv("AAQoL.csv") |>
  # Treat every character column as categorical.
  mutate(across(where(is.character), ~ as.factor(.x))) |>
  # Baseline (reference) categories for downstream model coefficients.
  mutate(
    `English Difficulties` = relevel(`English Difficulties`, ref = "Not at all"),
    `English Speaking`     = relevel(`English Speaking`, ref = "Not at all"),
    Ethnicity              = relevel(Ethnicity, ref = "Chinese"),
    Religion               = relevel(Religion, ref = "None")
  ) |>
  # Dichotomize income at $60,000 (the variable name suggests a median split —
  # TODO confirm against the sample median). Vector left-hand sides replace the
  # original six duplicated `~ "Below"` arms.
  mutate(Income_median = case_match(
    Income,
    c("$0 - $9,999", "$10,000 - $19,999", "$20,000 - $29,999",
      "$30,000 - $39,999", "$40,000 - $49,999", "$50,000 - $59,999") ~ "Below",
    c("$60,000 - $69,999", "$70,000 and over") ~ "Above",
    .default = Income
  )) |>
  mutate(Income_median = factor(Income_median, levels = c("Below", "Above"))) |>
  # Impose the intended low-to-high ordering on the Likert-style items.
  mutate(
    across(`Familiarity with America`:`Familiarity with Ethnic Origin`,
           ~ factor(.x, levels = c("Very low", "Low", "High", "Very high"))),
    `Identify Ethnically` = factor(`Identify Ethnically`,
                                   levels = c("Not at all", "Not very close",
                                              "Somewhat close", "Very close")),
    Belonging = factor(Belonging,
                       levels = c("Not at all", "Not very much",
                                  "Somewhat", "Very much")),
    `Primary Language` = as.factor(`Primary Language`)
  )
New names:
Rows: 2609 Columns: 231
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(190): Gender, Ethnicity, Marital Status, No One, Spouse, Children, Gran... dbl
(41): Survey ID, Age, Education Completed, Household Size, Grandparent,...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `Other` -> `Other...17`
• `Other` -> `Other...89`
# Interactive preview of the full data set (client-side DataTables render).
qol |> DT::datatable()
Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html
# install.packages("randomForestSRC")  # one-time install (closing quote fixed)

# --- Random forest: annual physical check-up (Yes / 0) --------------------
# Build the modeling frame: outcome + demographic/acculturation predictors,
# complete cases only, syntactic column names for the model formula.
rfdata <- qol |>
  select(`Physical Check-up`, Ethnicity, Age, Gender, Religion,
         `Full Time Employment`, Income_median, `US Born`:`Discrimination`,
         `Health Insurance`, `Dental Insurance`) |>
  na.omit() |>
  rename(Employment = `Full Time Employment`,
         EnglishSpeak = `English Speaking`,
         EnglishDiff = `English Difficulties`) |>
  as.data.frame() |>
  rename_with(make.names)  # e.g. `Physical Check-up` -> Physical.Check.up

# 80/20 split, stratified on the outcome.
inTraining <- createDataPartition(rfdata$Physical.Check.up, p = .8, list = FALSE)
training <- rfdata[inTraining, ]
testing  <- rfdata[-inTraining, ]

# 10-fold cross-validation, repeated 10 times.
fitControl <- trainControl(method = "repeatedcv",
                           number = 10,
                           repeats = 10)

cl <- makePSOCKcluster(10)
registerDoParallel(cl)
# NOTE(review): set.seed() alone does not guarantee bit-identical CV results
# across PSOCK workers; a parallel-safe RNG stream (clusterSetRNGStream) or
# trainControl(seeds = ...) is needed for exact reproducibility — confirm.
set.seed(825)
rf_fit <- train(Physical.Check.up ~ ., data = training,
                method = "rf",
                trControl = fitControl,
                # passed through `...`; randomForest() has no verbose argument,
                # so this is inert for method = "rf" (the original comment
                # about gbm() was caret-boilerplate)
                verbose = FALSE)
stopCluster(cl)
# Scaled variable importance from the final RF model (top 20 shown).
varImp(rf_fit)
rf variable importance
only 20 most important variables shown (out of 39)
Overall
Duration.of.Residency 100.000
Age 84.561
Health.InsuranceYes 36.124
Dental.InsuranceYes 35.967
Income_medianAbove 22.141
GenderMale 18.296
EmploymentEmployed full time 13.234
EnglishDiffNot much 12.709
Discrimination 12.332
Primary.Language1 11.846
Familiarity.with.Ethnic.OriginHigh 11.255
EnglishSpeakVery well 10.775
BelongingSomewhat 10.739
EthnicityKorean 10.614
Identify.EthnicallyVery close 10.303
Familiarity.with.Ethnic.OriginVery high 10.108
Familiarity.with.AmericaLow 10.000
Familiarity.with.AmericaHigh 9.953
EnglishSpeakWell 9.803
Identify.EthnicallySomewhat close 9.520
# Evaluate on the held-out 20%. Left-assignment replaces the original `->`.
pc_pred <- predict(rf_fit, newdata = testing)
confusionMatrix(data = pc_pred, reference = testing$Physical.Check.up)
Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 32 7
Yes 94 260
Accuracy : 0.743
95% CI : (0.6968, 0.7855)
No Information Rate : 0.6794
P-Value [Acc > NIR] : 0.003544
Kappa : 0.2785
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.25397
Specificity : 0.97378
Pos Pred Value : 0.82051
Neg Pred Value : 0.73446
Prevalence : 0.32061
Detection Rate : 0.08142
Detection Prevalence : 0.09924
Balanced Accuracy : 0.61388
'Positive' Class : 0
# install.packages("randomForestSRC")  # one-time install (closing quote fixed)

# --- Random forest: annual dentist check-up (Yes / 0) ---------------------
# Same predictor set as the physical check-up model; only the outcome differs.
rfdata <- qol |>
  select(`Dentist Check-up`, Ethnicity, Age, Gender, Religion,
         `Full Time Employment`, Income_median, `US Born`:`Discrimination`,
         `Health Insurance`, `Dental Insurance`) |>
  na.omit() |>
  rename(Employment = `Full Time Employment`,
         EnglishSpeak = `English Speaking`,
         EnglishDiff = `English Difficulties`) |>
  as.data.frame() |>
  rename_with(make.names)  # e.g. `Dentist Check-up` -> Dentist.Check.up

# 80/20 split, stratified on the outcome.
inTraining <- createDataPartition(rfdata$Dentist.Check.up, p = .8, list = FALSE)
training <- rfdata[inTraining, ]
testing  <- rfdata[-inTraining, ]

# 10-fold cross-validation, repeated 10 times.
fitControl <- trainControl(method = "repeatedcv",
                           number = 10,
                           repeats = 10)

cl <- makePSOCKcluster(10)
registerDoParallel(cl)
# NOTE(review): see the physical check-up chunk — set.seed() alone does not
# make parallel repeated CV exactly reproducible across PSOCK workers.
set.seed(825)
rf_fit <- train(Dentist.Check.up ~ ., data = training,
                method = "rf",
                trControl = fitControl,
                # inert for method = "rf"; kept for output parity with the
                # original chunk (the gbm() comment was caret-boilerplate)
                verbose = FALSE)
stopCluster(cl)
# Scaled variable importance from the final RF model (top 20 shown).
varImp(rf_fit)
rf variable importance
only 20 most important variables shown (out of 39)
Overall
Duration.of.Residency 100.000
Age 60.759
Dental.InsuranceYes 56.719
Income_medianAbove 25.238
Health.InsuranceYes 20.080
GenderMale 14.996
EthnicityAsian Indian 13.683
EmploymentEmployed full time 12.474
Primary.Language1 11.544
EnglishSpeakVery well 11.352
ReligionHindu 11.042
Discrimination 10.731
Familiarity.with.AmericaLow 10.302
EnglishDiffVery much 9.654
Identify.EthnicallySomewhat close 9.617
BelongingSomewhat 9.083
Familiarity.with.Ethnic.OriginHigh 8.995
ReligionCatholic 8.614
Familiarity.with.AmericaHigh 8.484
Familiarity.with.Ethnic.OriginVery high 8.464
# Evaluate on the held-out 20%. Left-assignment replaces the original `->`.
pc_pred <- predict(rf_fit, newdata = testing)
confusionMatrix(data = pc_pred, reference = testing$Dentist.Check.up)
Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 91 31
Yes 71 199
Accuracy : 0.7398
95% CI : (0.6934, 0.7826)
No Information Rate : 0.5867
P-Value [Acc > NIR] : 1.796e-10
Kappa : 0.4431
Mcnemar's Test P-Value : 0.0001127
Sensitivity : 0.5617
Specificity : 0.8652
Pos Pred Value : 0.7459
Neg Pred Value : 0.7370
Prevalence : 0.4133
Detection Rate : 0.2321
Detection Prevalence : 0.3112
Balanced Accuracy : 0.7135
'Positive' Class : 0
# install.packages("randomForestSRC")  # one-time install (closing quote fixed)

# --- Random forest: folk medicine use (Yes / 0) ---------------------------
# Same predictor set as the check-up models; only the outcome differs.
# NOTE(review): the confusion matrix printed below shows the fitted model
# predicts ONLY the majority class (no "Yes" predictions) — the outcome is
# heavily imbalanced (~86% "0"). Consider class weights, up/down-sampling
# (trainControl(sampling = ...)), or a different decision threshold.
rfdata <- qol |>
  select(`Folkmedicine`, Ethnicity, Age, Gender, Religion,
         `Full Time Employment`, Income_median, `US Born`:`Discrimination`,
         `Health Insurance`, `Dental Insurance`) |>
  na.omit() |>
  rename(Employment = `Full Time Employment`,
         EnglishSpeak = `English Speaking`,
         EnglishDiff = `English Difficulties`) |>
  as.data.frame() |>
  rename_with(make.names)

# 80/20 split, stratified on the outcome.
inTraining <- createDataPartition(rfdata$Folkmedicine, p = .8, list = FALSE)
training <- rfdata[inTraining, ]
testing  <- rfdata[-inTraining, ]

# 10-fold cross-validation, repeated 10 times.
fitControl <- trainControl(method = "repeatedcv",
                           number = 10,
                           repeats = 10)

cl <- makePSOCKcluster(10)
registerDoParallel(cl)
# NOTE(review): see the physical check-up chunk — set.seed() alone does not
# make parallel repeated CV exactly reproducible across PSOCK workers.
set.seed(825)
rf_fit <- train(Folkmedicine ~ ., data = training,
                method = "rf",
                trControl = fitControl,
                # inert for method = "rf"; kept for output parity with the
                # original chunk (the gbm() comment was caret-boilerplate)
                verbose = FALSE)
stopCluster(cl)
# Scaled variable importance from the final RF model (top 20 shown).
varImp(rf_fit)
rf variable importance
only 20 most important variables shown (out of 39)
Overall
Age 100.00
Duration.of.Residency 90.04
ReligionProtestant 23.50
Discrimination 22.69
EthnicityKorean 20.57
GenderMale 20.00
Income_medianAbove 19.77
Dental.InsuranceYes 17.49
EnglishSpeakVery well 17.05
EmploymentEmployed full time 17.03
Familiarity.with.AmericaHigh 16.86
Familiarity.with.Ethnic.OriginHigh 16.65
EnglishSpeakWell 16.18
EnglishDiffNot much 15.64
Primary.Language1 15.41
Identify.EthnicallyVery close 15.26
BelongingSomewhat 15.22
Identify.EthnicallySomewhat close 15.03
EnglishDiffMuch 14.76
BelongingNot very much 14.17
# Evaluate on the held-out 20%. Left-assignment replaces the original `->`.
# The result below is degenerate: every test case is predicted "0"
# (Kappa = 0, Specificity = 0) — see the imbalance note at the model chunk.
pc_pred <- predict(rf_fit, newdata = testing)
confusionMatrix(data = pc_pred, reference = testing$Folkmedicine)
Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 336 53
Yes 0 0
Accuracy : 0.8638
95% CI : (0.8256, 0.8962)
No Information Rate : 0.8638
P-Value [Acc > NIR] : 0.5365
Kappa : 0
Mcnemar's Test P-Value : 9.148e-13
Sensitivity : 1.0000
Specificity : 0.0000
Pos Pred Value : 0.8638
Neg Pred Value : NaN
Prevalence : 0.8638
Detection Rate : 0.8638
Detection Prevalence : 1.0000
Balanced Accuracy : 0.5000
'Positive' Class : 0